library(psych)
library(ggplot2)
Adjuntando el paquete: ‘ggplot2’
The following objects are masked from ‘package:psych’:
    %+%, alpha
library(readxl)#to see the database
library(openxlsx)#to save the database in excel
library(dplyr) # for data manipulation
Adjuntando el paquete: ‘dplyr’
The following objects are masked from ‘package:stats’:
    filter, lag
The following objects are masked from ‘package:base’:
    intersect, setdiff, setequal, union
library(tidyr) # for transforming datasets to long and wide formats
library(ordinal) # for ordinal logistic regression
Adjuntando el paquete: ‘ordinal’
The following object is masked from ‘package:dplyr’:
    slice
library(car) # for VIF calculations
Cargando paquete requerido: carData
Adjuntando el paquete: ‘car’
The following object is masked from ‘package:dplyr’:
    recode
The following object is masked from ‘package:psych’:
    logit
library(vcd) # for visualizing relations between factors
Aviso: package ‘vcd’ was built under R version 4.5.1
Cargando paquete requerido: grid
library(VGAM)
Aviso: package ‘VGAM’ was built under R version 4.5.1
Cargando paquete requerido: stats4
Cargando paquete requerido: splines
Adjuntando el paquete: ‘VGAM’
The following object is masked from ‘package:car’:
    logit
The following objects are masked from ‘package:ordinal’:
    dgumbel, dlgamma, pgumbel, plgamma, qgumbel, rgumbel, wine
The following objects are masked from ‘package:psych’:
    fisherz, logistic, logit
library(coefplot) # for plots about coefficients
Aviso: package ‘coefplot’ was built under R version 4.5.1
# Variable names collected for later reference in the analysis.
# NOTE: the original line used typographic quotes (“…”), which are a parse
# error in R; they are replaced here with plain double quotes.
Variables <- c("gender", "age", "method", "country", "reason", "nps_score",
               "satisfaction", "complain.connect", "complain.respond.vel",
               "complain.solve.vel", "complain.repeat")
# Fix the RNG seed so the whole simulation is reproducible
set.seed(1000)
# Sample size: 5 reasons x 4 methods x 2 countries x 1000 cases per cell
data.n <- 5 * 4 * 2 * 1000
data <- data.frame(id = as.factor(1:data.n))
# Ages: log-normal around 40, then truncated to the plausible 15-90 range
data$age <- round(rlnorm(n = data.n, meanlog = log(40), sdlog = log(1.4)))
data$age[data$age > 90] <- 90
data$age[data$age < 15] <- 15
# Balanced gender and country; methods and reasons with fixed shares
data$gender <- factor(sample(c("man", "woman"), prob = c(0.5, 0.5), replace = TRUE, size = data.n))
data$country <- factor(sample(c("Drinkland", "Eatland"), prob = c(0.5, 0.5), replace = TRUE, size = data.n))
data$method <- factor(sample(c("chat", "web_form", "email", "phone"), prob = c(0.4, 0.10, 0.15, 0.35), replace = TRUE, size = data.n))
data$reason <- factor(sample(c("incorrect_item", "delay", "status", "cancellation", "other"), prob = c(0.2, 0.3, 0.2, 0.1, 0.2), replace = TRUE, size = data.n))
# Placeholder free-text comment for every case
data$open.comment <- rep("bla bla", data.n)
We will simulate the presence of complaints about the connection (complain.connect), the speed to respond (complain.sp.respond), the speed to solve the problem (complain.sp.solve), and complaints about having to repeat information (complain.repeat), attending to several factors.
For each type of complaint we set an initial probability, then alter that probability depending on the presence of the factors mentioned above.
## For connection complaints
# Baseline probability of 0.1 for a connection complaint to appear
con.prob <- rep(0.1, data.n)
# Drinkland customers are twice as likely to complain about the connection
in.drinkland <- data$country == "Drinkland"
con.prob[in.drinkland] <- 2 * con.prob[in.drinkland]
# Clamp probabilities to [0, 1] (safeguard for further transformations)
con.prob <- pmax(0, pmin(1, con.prob))
# Draw the binary complaint indicator for every case
data$complain.connect <- rbinom(n = data.n, size = 1, prob = con.prob)
# Sanity check: complaint rate should be ~0.2 in Drinkland vs ~0.1 in Eatland
prop.table(table(complain.connect = data$complain.connect, country = data$country), margin = 2)
country
complain.connect Drinkland Eatland
0 0.79877380 0.90009028
1 0.20122620 0.09990972
## For complaints about the speed to respond
# Baseline probability of 0.05; method-specific multipliers applied below
spr.prob <- rep(0.05, data.n)
# 6x higher probability for email contacts
by.email <- data$method == "email"
spr.prob[by.email] <- 6 * spr.prob[by.email]
# 3x higher probability for chat and web-form contacts
by.chat.or.form <- data$method %in% c("chat", "web_form")
spr.prob[by.chat.or.form] <- 3 * spr.prob[by.chat.or.form]
# Clamp probabilities to [0, 1] (safeguard for further transformations)
spr.prob <- pmax(0, pmin(1, spr.prob))
# Draw the binary complaint indicator for every case
data$complain.sp.respond <- rbinom(n = data.n, size = 1, prob = spr.prob)
# Sanity check: complaint rates by contact method
prop.table(table(complain.sp.respond = data$complain.sp.respond, method = data$method), margin = 2)
method
complain.sp.respond chat email phone web_form
0 0.8513143 0.6948590 0.9489453 0.8577970
1 0.1486857 0.3051410 0.0510547 0.1422030
## For complaints about the speed to solve
# Baseline probability of 0.1; 5x higher for incorrect items handled by chat
sps.prob <- rep(0.1, data.n)
chat.incorrect <- data$method == "chat" & data$reason == "incorrect_item"
sps.prob[chat.incorrect] <- 5 * sps.prob[chat.incorrect]
# Clamp probabilities to [0, 1] (safeguard for further transformations)
sps.prob <- pmax(0, pmin(1, sps.prob))
# Draw the binary complaint indicator for every case
data$complain.sp.solve <- rbinom(n = data.n, size = 1, prob = sps.prob)
# Sanity check: complaint rates by reason within each method
prop.table(table(complain.sp.solve = data$complain.sp.solve,
                 reason = data$reason, method = data$method), margin = 2:3)
, , method = chat
reason
complain.sp.solve cancellation delay incorrect_item other status
0 0.89769976 0.90130873 0.48367030 0.89497857 0.89202454
1 0.10230024 0.09869127 0.51632970 0.10502143 0.10797546
, , method = email
reason
complain.sp.solve cancellation delay incorrect_item other status
0 0.90252101 0.89886364 0.89542484 0.91181364 0.90712570
1 0.09747899 0.10113636 0.10457516 0.08818636 0.09287430
, , method = phone
reason
complain.sp.solve cancellation delay incorrect_item other status
0 0.89528433 0.90653753 0.89816481 0.89511238 0.90674673
1 0.10471567 0.09346247 0.10183519 0.10488762 0.09325327
, , method = web_form
reason
complain.sp.solve cancellation delay incorrect_item other status
0 0.91361257 0.89368771 0.90847914 0.88372093 0.88280255
1 0.08638743 0.10631229 0.09152086 0.11627907 0.11719745
## For complaints about the need to repeat information
# Baseline probability of 0.02; 10x higher for cancellation contacts
rp.prob <- rep(0.02, data.n)
for.cancellation <- data$reason == "cancellation"
rp.prob[for.cancellation] <- 10 * rp.prob[for.cancellation]
# Clamp probabilities to [0, 1] (safeguard for further transformations)
rp.prob <- pmax(0, pmin(1, rp.prob))
# Draw the binary complaint indicator for every case
data$complain.repeat <- rbinom(n = data.n, size = 1, prob = rp.prob)
# Sanity check: ~0.2 for cancellations vs ~0.02 for other reasons
prop.table(table(complain.repeat = data$complain.repeat, reason = data$reason), margin = 2)
reason
complain.repeat cancellation delay incorrect_item other status
0 0.81061164 0.97864738 0.97990202 0.98145401 0.98006154
1 0.18938836 0.02135262 0.02009798 0.01854599 0.01993846
## For other complaints: flat 20% probability, independent of any factor
data$complain.other <- rbinom(n = data.n, size = 1, prob = 0.2)
# Sanity check: overall rate should be ~0.2
prop.table(table(complain.other = data$complain.other))
complain.other
0 1
0.79655 0.20345
##Simulation of NPS scores
We first assume a normal distribution of responses in the ideal case
in which customers have no complaints.
Then we apply the corrections based on the information we have simulated
about the presence of different complaints.
At the end, we limit the scores to 0 and 10 and transform them to integers, in line with restrictions of NPS scores.
# Preliminary scores assuming no complaints: normal noise around a high mean
data$nps.score <- rnorm(data.n, mean = 9, sd = 2)
# Fixed penalties per complaint type (e.g., 3 points for connection complaints).
# A stochastic penalty would add realism, but fixed values keep the
# relationships easy to interpret.
penalty <- 3 * data$complain.connect +
  4 * data$complain.sp.respond +
  6 * data$complain.sp.solve +
  2 * data$complain.repeat +
  1 * data$complain.other
data$nps.score <- data$nps.score - penalty
# Bound scores to the 0-10 NPS range and truncate to integers
data$nps.score <- floor(pmax(0, pmin(10, data$nps.score)))
# Distribution of the simulated scores
barplot(table(data$nps.score))
prop.table(table(data$nps.score))
0 1 2 3 4 5 6 7 8 9 10
0.062575 0.030050 0.042075 0.053425 0.067700 0.084100 0.106525 0.124975 0.131250 0.120450 0.176875
# Derive NPS segments from the raw score:
# 0-6 = detractor, 7-8 = passive, 9-10 = promoter
data$nps.segment <- cut(
  data$nps.score,
  breaks = c(-Inf, 6, 8, Inf),
  labels = c("detractor", "passive", "promoter")
)
# Check the resulting segment distribution
prop.table(table(nps.segment = data$nps.segment))
nps.segment
detractor passive promoter
0.446450 0.256225 0.297325
## Simulation of satisfaction scores
# 1-5 satisfaction: noisy baseline plus an effect of the standardized NPS score
baseline <- rnorm(data.n, mean = 3, sd = 0.5)
data$satisfaction <- floor(baseline + 0.9 * (1 + as.numeric(scale(data$nps.score))))
# Clamp to the valid 1-5 range
data$satisfaction <- pmax(1, pmin(5, data$satisfaction))
# Distribution of the simulated satisfaction scores
barplot(table(data$satisfaction))
prop.table(table(satisfaction = data$satisfaction))
satisfaction
1 2 3 4 5
0.04970 0.15285 0.28840 0.36965 0.13940
# Rank correlation between the simulated NPS and satisfaction scores
cor(data$nps.score, data$satisfaction, method = "spearman")
[1] 0.8293446
##Simulation of missing data
###Missing data for satisfaction
We simulate that satisfaction was not available for the country Drinkland, nor for the contact methods of email and web form. Additionally, within the situations in which the satisfaction item was available, there is a 10% probability that customers do not answer it.
# Satisfaction was never collected in Drinkland nor for email/web-form contacts
data$satisfaction[data$country == "Drinkland"] <- NA_real_
data$satisfaction[data$method %in% c("email", "web_form")] <- NA_real_
# Among the remaining cases, ~10% of customers skip the satisfaction item
answered <- which(!is.na(data$satisfaction))
na_randomizer <- sample(c(0, 1), size = length(answered), replace = TRUE, prob = c(0.1, 0.9))
data$satisfaction[answered[na_randomizer == 0]] <- NA_real_
# Inspect the result (note the NA count)
summary(data$satisfaction)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
1.00 3.00 4.00 3.46 4.00 5.00 26557
Our main goal was to understand customer satisfaction. But, we didn’t have satisfaction data readily available for all countries and contact methods. Given this limitation, and because our key stakeholders were already familiar with it, we decided to use the Net Promoter Score (NPS) as our primary indicator.
NPS measures customer loyalty and how likely they are to recommend our services. It gave us a consistent way to evaluate customer perception across all the different areas we were interested in. For this analysis, NPS acted as a good proxy for overall customer experience and satisfaction.
To make sure NPS was a reliable stand-in for satisfaction, I took a few key steps:
I checked for a strong correlation between satisfaction scores and NPS scores. I looked at whether the satisfaction score was correlated with both, the raw scores (the 0-10 scale used for NPS) and the categorical scores (the percentage of Promoters minus Detractors for NPS).
For cases where we did have satisfaction data, I ran parallel analyses using those scores. This helped me see if the trends and insights matched what we found with NPS, or if any new patterns emerged.
We first explored the distribution shapes for both satisfaction and nps scores, using bar charts from the ggplot2 package:
# Distribution of the NPS scores:
nps.plot.data <- subset(data, !is.na(nps.score))
ggplot(nps.plot.data, aes(x = factor(nps.score))) +
  geom_bar(fill = "#2980b9") +
  labs(title = "Distribution of NPS Scores",
       x = "NPS Score",
       y = "Count") +
  theme_minimal()
# Distribution of satisfaction scores:
sat.plot.data <- subset(data, !is.na(satisfaction))
ggplot(sat.plot.data, aes(x = factor(satisfaction))) +
  geom_bar(fill = "#27ae60") +
  labs(title = "Distribution of Satisfaction",
       x = "Satisfaction Score",
       y = "Count") +
  theme_minimal()
Neither of the distributions follows a normal distribution, with both nps.score and satisfaction showing left-skewed patterns. Notably, the distribution of nps.score also shows signs of a ceiling effect, with a considerable proportion of scores concentrated at the upper end of the scale.
Given these distributional characteristics, Pearson’s correlation is not an appropriate choice, as it assumes linearity and approximate normality, and can be highly sensitive to skewed or bounded data.
Before selecting a more robust correlation measure—such as Spearman’s rank or polychoric correlation—we will inspect the relationship between nps.score and satisfaction using a scatterplot. This will allow us to visually assess the form and strength of the association and decide whether the relationship appears monotonic or not.
# Scatterplot (ggplot2) of jittered scores, to inspect the form of the relation
scatter.df <- na.omit(data[, c("nps.score", "satisfaction")])
ggplot(scatter.df, aes(x = jitter(nps.score), y = jitter(satisfaction))) +
  geom_point(alpha = 0.5, color = "#2c3e50") +
  labs(title = "Relation between NPS Scores and Satisfaction",
       x = "NPS Score",
       y = "Satisfaction") +
  theme_minimal()
NA
NA
The scatterplot reveals a strong monotonic and linear relationship between NPS score and Satisfaction. Considering that both variables are ordinal, we will use Spearman’s rank correlation, which assesses the strength of monotonic associations without requiring interval-level measurement or normal distribution.
# Spearman correlation with a significance test (psych package)
corr.test(data$nps.score, data$satisfaction, method = "spearman", use = "complete.obs")
Call:corr.test(x = nps.score, y = satisfaction, use = "complete.obs",
method = "spearman")
Correlation matrix
[1] 0.82
Sample Size
[1] 13443
These are the unadjusted probability values.
The probability values adjusted for multiple tests are in the p.adj object.
[1] 0
To see confidence intervals of the correlations, print with the short=FALSE option
Our analysis revealed a strong positive Spearman correlation between satisfaction and NPS scores, which provides confidence that NPS effectively serves as a proxy for customer satisfaction in this dataset.
##2.2. Heatmaps of Scores Across Methods, Reasons, and Countries
Since we had many levels across contact methods, contact reasons and countries, we wanted to create a heatmap that would allow us and our stakeholders to explore the effects of these factors and their interactions.
###2.2.1. Reusable function to get heatmaps across factors
We first specified a function to obtain these heatmaps for all of our dependent variables, and for any function we wanted to apply (e.g., mean, NPS categorical…).
# Build a heatmap-style summary table of one dependent variable crossed by
# three factors.
#
# Arguments:
#   my.data        data.frame holding the scores and the three factor columns
#   my.dv          name (string) of the dependent variable to summarise
#   factor.columns name of the factor whose levels become the table columns
#   factor.rows    name of the factor whose levels become the rows of a block
#   factor.blocks  name of the factor whose levels define the stacked blocks
#   my.function    summary function applied to the non-NA scores of each cell
#                  (e.g. mean, length, f.nps.cat)
#
# Returns a data.frame with a "group" label column, an "overall" column and
# one column per level of the columns factor. Rows are: the whole-sample
# total, one row per row-factor level, then per block its total and each
# row-factor level within it. Cells with no usable scores are NA; a dataset
# with no usable scores at all raises an error.
f.heatmap.3f <- function(my.data, my.dv, factor.columns, factor.rows, factor.blocks, my.function){
  scores <- my.data[[my.dv]]
  # Fail early when there is not a single non-NA score (original behavior)
  if (length(scores[!is.na(scores)]) == 0) {
    stop("The dataset is empty")
  }
  # Factor levels in order of appearance, as unique() returns them
  col.levels <- as.character(unique(my.data[[factor.columns]]))
  row.levels <- as.character(unique(my.data[[factor.rows]]))
  block.levels <- as.character(unique(my.data[[factor.blocks]]))
  # Helper: apply my.function to the non-NA scores selected by a logical
  # filter; NA_real_ when the selection is empty
  cell.value <- function(filter){
    v <- scores[filter]
    v <- v[!is.na(v)]
    if (length(v) > 0) my.function(v) else NA_real_
  }
  # Helper: build one output line = label, overall value for the filter, and
  # that value disaggregated by each level of the columns factor
  make.line <- function(label, filter){
    by.col <- vapply(col.levels,
                     function(cl) cell.value(filter & my.data[[factor.columns]] == cl),
                     numeric(1))
    c(group = label, overall = cell.value(filter), by.col)
  }
  everything <- rep(TRUE, nrow(my.data))
  results <- list()
  k <- 1
  # 1. General block: whole-sample total, then one line per row-factor level
  results[[k]] <- make.line("Total all sample", everything)
  k <- k + 1
  for (r in row.levels){
    results[[k]] <- make.line(r, my.data[[factor.rows]] == r)
    k <- k + 1
  }
  # 2. One block per level of the blocks factor: the block total first,
  #    then each row-factor level crossed with the block
  for (b in block.levels){
    in.block <- my.data[[factor.blocks]] == b
    results[[k]] <- make.line(paste0("Total: ", b), in.block)
    k <- k + 1
    for (r in row.levels){
      results[[k]] <- make.line(paste0(b, ": ", r), in.block & my.data[[factor.rows]] == r)
      k <- k + 1
    }
  }
  # 3. Stack the lines into a data.frame; rbind of named character vectors
  #    makes everything character, so convert the value columns back to numeric
  results.df <- as.data.frame(do.call(rbind, results), stringsAsFactors = FALSE)
  results.df[, -1] <- lapply(results.df[, -1], as.numeric)
  return(results.df)
}
To obtain the NPS in categorical terms, we also created a simple reusable function to obtain these scores from any vector of nps raw scores.
# Categorical NPS from a vector of raw 0-10 scores:
# (% promoters, scores 9-10) minus (% detractors, scores 0-6), on a -100..100 scale.
#
# Arguments:
#   my.nps.vector numeric vector of raw NPS scores; may contain NAs
#   na.rm         if TRUE (default) NAs are dropped before the calculation;
#                 if FALSE and NAs are present, NA_real_ is returned.
#                 (Fix: this argument was previously accepted but ignored.)
# Errors when no scores remain or any score is outside 0-10; warns when n < 70.
f.nps.cat <- function(my.nps.vector, na.rm = TRUE){
  # Honor na.rm: with na.rm = FALSE, NAs propagate as in mean()/sum()
  if (!na.rm && anyNA(my.nps.vector)) {
    return(NA_real_)
  }
  vector.c <- my.nps.vector[!is.na(my.nps.vector)]
  # Guard against empty input and scores outside the valid NPS range
  if (length(vector.c) == 0) {stop("error: no available nps raw scores")}
  if (any(vector.c > 10)) {stop("error: scores higher than 10 in the nps raw scores")}
  if (any(vector.c < 0)) {stop("error: scores lower than 0 in the nps raw scores")}
  # Small samples make the categorical NPS unstable
  if (length(vector.c) < 70) {warning("few scores available for calculation (n<70)")}
  prop.promoters <- mean(vector.c >= 9)  # proportion of promoters (9-10)
  prop.detractors <- mean(vector.c <= 6) # proportion of detractors (0-6)
  (prop.promoters - prop.detractors) * 100
}
# Check the function against the segment proportions computed earlier:
# the result must equal (promoter % - detractor %) * 100 from the table below
f.nps.cat(data$nps.score)
[1] -14.9125
# Segment proportions for comparison (29.7325 - 44.6450 = -14.9125)
with(data, prop.table(table(nps.segment)))
nps.segment
detractor passive promoter
0.446450 0.256225 0.297325
The function is working correctly, we can see that the score we obtain is equal to substracting the percentage of promoters and detractors that we calculated with the table function on the nps.segment variable.
Yet, before getting results, we check if our sample sizes are acceptable and not prone to sampling biases
# Cell sample sizes for the heatmap: my.function = length counts the non-NA
# scores in every cell, so we can rule out small-sample cells before reporting
nps.cat.heatmap.n <- f.heatmap.3f(
  my.data = data,
  my.dv = "nps.score",
  factor.columns = "method",
  factor.rows = "reason",
  factor.blocks = "country",
  my.function = length # cell sample sizes
)
All sample sizes are high across all cells (n > 150), suggesting little probability of sampling error.
Now we are ready to calculate the heatmap for the NPS categorical using the functions we have prepared.
# Heatmap of the categorical NPS: method in columns, reason in rows,
# one block per country
nps.cat.heatmap <- f.heatmap.3f(
  my.data = data,
  my.dv = "nps.score",
  factor.columns = "method",
  factor.rows = "reason",
  factor.blocks = "country",
  my.function = f.nps.cat # categorical NPS per cell
)
# Export to Excel; colors are added there with conditional formatting
write.xlsx(nps.cat.heatmap, "nps.cat.heatmap.xlsx", colnames = TRUE)
After applying conditional formatting in excel based on colors, we get our heatmap to identify areas in which probability to recommend our customer service is very low:
knitr::include_graphics("Heatmap.nps.cat.png")
The observation of the heatmap suggests several effects: - Customers in Drinkland have a lower probability to promote than those in Eatland. - Customers contacting for cancellations have a lower probability to promote than customers calling for other reasons. - Phone is the contact method associated with the highest probability to promote, while email is associated with the lowest. - Customers contacting through chat about incorrect item deliveries show a very low probability to promote.
Yet, before we proceed to further investigate these effects, we check whether we find similar patterns with other measures. The NPS categorical, although it has the advantage of being familiar to our stakeholders, has several limitations, such as potentially leading to confusion because of the information lost by the percentage subtraction (it is based on subtracting the % of detractors, those who score between 0 and 6, from the % of promoters, those who score between 9 and 10). To illustrate this limitation, a categorical NPS score of 20 can come from infinite distributions of NPS raw scores, including some as different as: - distribution a) 60% of promoters and 40% of detractors - distribution b) 20% of promoters and 0% of detractors.
To make sure the NPS categorical is not leading us to confusing patterns and ambiguities, we check whether we obtain similar patterns with the NPS raw score. We apply our premade heatmap function to the mean of the NPS raw scores.
# Heatmap of the mean raw NPS score, for comparison with the categorical NPS
nps.raw.heatmap <- f.heatmap.3f(
  my.data = data,
  my.dv = "nps.score",
  factor.columns = "method",
  factor.rows = "reason",
  factor.blocks = "country",
  my.function = mean # mean raw NPS per cell
)
# Export to Excel; colors are added there with conditional formatting
write.xlsx(nps.raw.heatmap, "nps.raw.heatmap.xlsx", colnames = TRUE)
# Visualize both heatmaps after adding the colors in Excel:
knitr::include_graphics("Heatmap.nps2.png")
NA
We can see a similar pattern for both ways to calculate the results of the NPS, as we described in the section x.
Yet, we still have doubts about the quality of the NPS as a measure, in which customers ask for the probability to promote. To further gain confidence on it, we check its correspondence with the satisfaction scores.
For satisfaction we only had scores for one country (Eatland) and for 2 contact methods (phone and chat). However, these scores are expected to be a more valid measure of satisfaction than the NPS scores, so we will use them to see if they show a different pattern than the NPS scores.
For that, we apply the function we created in the prior section to the satisfaction scores, and we save the obtained dataframe into excel to further clean and include colors in the heatmap.
# Heatmap of mean satisfaction (only cells where satisfaction was collected)
satisfaction.heatmap <- f.heatmap.3f(
  my.data = data,
  my.dv = "satisfaction",
  factor.columns = "method",
  factor.rows = "reason",
  factor.blocks = "country",
  my.function = mean # mean satisfaction per cell
)
# Export to Excel; colors are added there with conditional formatting
write.xlsx(satisfaction.heatmap, "satisfaction.heatmap.xlsx", colnames = TRUE)
# Visualize all heatmaps after adding the colors in Excel:
knitr::include_graphics("Heatmap.png")
NA
NA
We can see a similar pattern for satisfaction and both ways to calculate the results of the NPS, suggesting that, in this context, our NPS categorical measure is a good indicator of satisfaction, and that we can simplify our conversations with stakeholders by focusing on this measure.
Yet, to make sure our sight is not tricking us, we also check how the results are correlated
###2.2.4. Correlation coefficients between NPS measures and satisfaction
To see the strength of the correlations we will calculate Pearson correlations across the three measures for each cell in the heatmap. For that, we first move all the cells of the heatmaps databases in a single column, then apply the correlation.
# Move all cells of each heatmap into a single column per measure
sat.long <- pivot_longer(satisfaction.heatmap, cols = c(-1),
                         names_to = "evaluation",
                         values_to = "satisfaction")
nps.raw.long <- pivot_longer(nps.raw.heatmap, cols = c(-1),
                             names_to = "evaluation",
                             values_to = "nps.raw")
nps.cat.long <- pivot_longer(nps.cat.heatmap, cols = c(-1),
                             names_to = "evaluation",
                             values_to = "nps.cat")
# Merge the three measures into one dataset, matching cells by group and evaluation
sat.and.raw <- merge(sat.long, nps.raw.long, by = c("group", "evaluation"))
merged.heatmaps <- merge(nps.cat.long, sat.and.raw, by = c("group", "evaluation"))
# Pearson correlation matrix across the cell-level results of the three measures
cor(merged.heatmaps[, 3:5], use = "complete.obs")
nps.cat satisfaction nps.raw
nps.cat 1.0000000 0.9571819 0.9854577
satisfaction 0.9571819 1.0000000 0.9850182
nps.raw 0.9854577 0.9850182 1.0000000
All measures appear to be strongly correlated (r > .90), giving us confidence in the NPS categorical being a good indicator of satisfaction and probability to promote. This is good news because our stakeholders are more familiar with this measure, and we can use its heatmap to discuss effects with them.